Decision Tree¶

The Data¶

Data were collected and made available by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER, a member of the Long Term Ecological Research Network.

  • species: penguin species (Chinstrap, Adélie, or Gentoo)
  • culmen_length_mm: culmen length (mm)
  • culmen_depth_mm: culmen depth (mm)
  • flipper_length_mm: flipper length (mm)
  • body_mass_g: body mass (g)
  • island: island name (Dream, Torgersen, or Biscoe) in the Palmer Archipelago (Antarctica)
  • sex: penguin sex

Note: the culmen is "the upper ridge of a bird's beak".

Our goal is to create a model that can predict a penguin's species from its physical attributes. Researchers could then use that model to classify penguins in the field without needing an experienced biologist on hand.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df = pd.read_csv("../DATA/penguins_size.csv")
In [3]:
df.head()
Out[3]:
species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 MALE
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 FEMALE
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 FEMALE
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 FEMALE

Feature Engineering¶

In [4]:
df['species'].unique()
Out[4]:
array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)
In [5]:
# isnull() marks each missing entry as True; sum() treats False as 0 and
# True as 1, so this counts the null values in each column
df.isnull().sum()
Out[5]:
species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64
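To make the counting mechanism concrete, here is a minimal sketch (not part of the original notebook): isnull() returns a boolean DataFrame, and summing casts True to 1 and False to 0.

df.isnull().head()        # boolean mask; True marks a missing entry
df.isnull()['sex'].sum()  # True counts as 1, so this returns 10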
In [6]:
df = df.dropna()
In [7]:
df.isnull().sum()
Out[7]:
species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64
In [8]:
# Notice the '.' placeholder in the sex column
df['sex'].unique()
Out[8]:
array(['MALE', 'FEMALE', '.'], dtype=object)
In [9]:
df[df['sex']=='.']
Out[9]:
species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g sex
336 Gentoo Biscoe 44.5 15.7 217.0 4875.0 .
In [10]:
# to determine the missing sex value for that penguin, compare its measurements
# against the per-sex summary statistics for Gentoo penguins;
# the values sit closer to the FEMALE averages
df[df['species']=='Gentoo'].groupby('sex').describe().transpose()
Out[10]:
sex . FEMALE MALE
culmen_length_mm count 1.0 58.000000 61.000000
mean 44.5 45.563793 49.473770
std NaN 2.051247 2.720594
min 44.5 40.900000 44.400000
25% 44.5 43.850000 48.100000
50% 44.5 45.500000 49.500000
75% 44.5 46.875000 50.500000
max 44.5 50.500000 59.600000
culmen_depth_mm count 1.0 58.000000 61.000000
mean 15.7 14.237931 15.718033
std NaN 0.540249 0.741060
min 15.7 13.100000 14.100000
25% 15.7 13.800000 15.200000
50% 15.7 14.250000 15.700000
75% 15.7 14.600000 16.100000
max 15.7 15.500000 17.300000
flipper_length_mm count 1.0 58.000000 61.000000
mean 217.0 212.706897 221.540984
std NaN 3.897856 5.673252
min 217.0 203.000000 208.000000
25% 217.0 210.000000 218.000000
50% 217.0 212.000000 221.000000
75% 217.0 215.000000 225.000000
max 217.0 222.000000 231.000000
body_mass_g count 1.0 58.000000 61.000000
mean 4875.0 4679.741379 5484.836066
std NaN 281.578294 313.158596
min 4875.0 3950.000000 4750.000000
25% 4875.0 4462.500000 5300.000000
50% 4875.0 4700.000000 5500.000000
75% 4875.0 4875.000000 5700.000000
max 4875.0 5200.000000 6300.000000
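To make the eyeball comparison explicit, here is a minimal sketch (not part of the original notebook) that measures how far row 336 sits from each sex's Gentoo averages, in units of each group's standard deviation; the smaller total points to FEMALE.

num_cols = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']
gentoo = df[(df['species'] == 'Gentoo') & (df['sex'] != '.')]
means = gentoo.groupby('sex')[num_cols].mean()
stds = gentoo.groupby('sex')[num_cols].std()
row = df.loc[336, num_cols].astype(float)
# total standardized distance from each sex's mean; the smaller value wins
((means - row).abs() / stds).sum(axis=1)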
In [11]:
# assign 'FEMALE' to the 'sex' column of the row with index label 336
df.at[336,'sex'] = 'FEMALE'

Building the Decision Tree¶

In [12]:
# decision trees don't require feature scaling: each split thresholds a single
# feature at a time, so features are never combined and their relative scales don't matter.
# scikit-learn's decision tree doesn't accept string features, so we convert the
# categorical columns (island, sex) to numerical dummy variables
X = pd.get_dummies(df.drop('species', axis=1), drop_first=True)
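It is worth inspecting what get_dummies produced. With drop_first=True, each categorical column keeps all but one of its levels as 0/1 indicator columns; the dropped level becomes the implicit baseline (a quick check, not in the original notebook):

X.columns  # island and sex become island_Dream, island_Torgersen, sex_MALE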
In [13]:
y = df['species']
In [14]:
from sklearn.model_selection import train_test_split
In [15]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
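A quick sanity check (not in the original notebook) confirms the 70/30 split; with 334 rows remaining after dropna, the test set holds the 101 rows that appear as the support in the classification report below:

x_train.shape, x_test.shape  # (233, 7) and (101, 7)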
In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree 
# note: plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay (used below) is its replacement
from sklearn.metrics import confusion_matrix, classification_report
In [17]:
def report_model(model):
    # print test-set metrics, then draw the fitted tree
    model_preds = model.predict(x_test)
    print(classification_report(y_test, model_preds))
    print('\n')
    plt.figure(figsize=(12, 8), dpi=150)
    plot_tree(model, filled=True, feature_names=X.columns);
In [18]:
base_model = DecisionTreeClassifier()
base_model.fit(x_train, y_train)
Out[18]:
DecisionTreeClassifier()
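With no constraints passed to the constructor, DecisionTreeClassifier keeps splitting until every leaf is pure. A small sketch (not in the original notebook) to inspect how large the default tree grew:

base_model.get_depth(), base_model.get_n_leaves()  # depth and leaf count of the unpruned tree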
In [19]:
base_model.feature_importances_
Out[19]:
array([0.32352044, 0.05366774, 0.542054  , 0.00239775, 0.07836008,
       0.        , 0.        ])
In [20]:
# feature importances are computed after the tree has been constructed;
# they summarize each feature's overall contribution to the tree's splits
pd.DataFrame(index=X.columns,data=base_model.feature_importances_,columns=['Feature Importance'])
Out[20]:
Feature Importance
culmen_length_mm 0.323520
culmen_depth_mm 0.053668
flipper_length_mm 0.542054
body_mass_g 0.002398
island_Dream 0.078360
island_Torgersen 0.000000
sex_MALE 0.000000
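As a follow-up sketch (not in the original notebook), a horizontal bar chart makes this ranking easier to read than the raw table:

imp = pd.Series(base_model.feature_importances_, index=X.columns).sort_values()
imp.plot(kind='barh')
plt.xlabel('Feature Importance');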
In [21]:
report_model(base_model)
              precision    recall  f1-score   support

      Adelie       0.93      0.98      0.95        41
   Chinstrap       0.95      0.87      0.91        23
      Gentoo       1.00      1.00      1.00        37

    accuracy                           0.96       101
   macro avg       0.96      0.95      0.95       101
weighted avg       0.96      0.96      0.96       101
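The classification report can be complemented with a confusion matrix. Since plot_confusion_matrix was removed from recent scikit-learn releases, here is a sketch using its replacement, ConfusionMatrixDisplay (available from scikit-learn 1.0 onward):

from sklearn.metrics import ConfusionMatrixDisplay

# rows are true species, columns are predicted species
ConfusionMatrixDisplay.from_estimator(base_model, x_test, y_test);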